The purpose of this Jupyter notebook is to clearly give the reader access to as much of the analysis that we performed for this paper as possible. We hope that you find this code useful for understanding our work, and maybe useful for your own work.
We used Kallisto to map reads and estimate TPM counts and Sleuth to analyze the RNA-seq data.
However, because I like to make my own plots, and because I wanted to carry out extensive analysis (I mainly write in python), the results were transferred from R into this python pipeline.
This pipeline is built using Python > 3.5
In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tissue_enrichment_analysis as tea
import pyrnaseq_graphics as rsq
import scipy.stats as stats
import matplotlib as mpl
from IPython.core.display import HTML
# bokeh
import bokeh.charts
import bokeh.charts.utils
import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html
# Display graphics in this notebook
bokeh.io.output_notebook()
from scipy.stats import gaussian_kde
from scipy import odr
%matplotlib inline
# This enables high res graphics inline (only use with static plots (non-Bokeh))
# SVG is preferred, but there is a bug in Jupyter with vertical lines
%config InlineBackend.figure_formats = {'png', 'retina'}
import matplotlib.cm as cm
from matplotlib import rc
rc('text', usetex=True)
import matplotlib.patheffects as path_effects
rc('text', usetex=True)
rc('text.latex', preamble=r'\usepackage{cmbright}')
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})
# Magic function to make matplotlib inline;
%matplotlib inline
# This enables SVG graphics inline.
%config InlineBackend.figure_formats = {'png', 'retina'}
# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2,
'axes.labelsize': 18,
'axes.titlesize': 18,
'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style("dark")
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16
mpl.rcParams['legend.fontsize'] = 14
Set the q-value below which to consider genes differentially expressed
In [2]:
qval = .1 # qvalue from regression
First, I load all my data.
Briefly, remember that Sleuth calculates log-linear models of the form
$$ log(y_i) = \beta_0 + \sum\limits_{k\epsilon K}\beta_k \cdot x_k + \sum\limits_{k\epsilon K}\sum\limits_{l\epsilon L} \beta_{k, l} \cdot x_k \cdot x_l + ... $$and these linear models can be extended to have interactions, etc.
For our specific model, we chose a linear model with interactions, of the form:
$$ log(y_i) = \beta_0 + \beta_{\mathrm{Old Adult}} \cdot x_{\mathrm{Old Adult}} + \beta_{\mathrm{fog-2}} \cdot x_{\mathrm{fog-2}} + \beta_{\mathrm{fog-2, Old Adult}} \cdot x_{\mathrm{fog-2}} \cdot x_{\mathrm{Old Adult}} $$Therefore, I have to load the following files:
I will also load all the golden standards that we have generated from the literature.
Finally, I load all the tissue, phenotype, and GO dictionaries, the list of all transcription factors in C. elegans and specify a few parameters
In [3]:
output_aging = '../output/raw_aging_plots/'
output_genotype = '../output/raw_genotype_plots/'
output_interaction = '../output/raw_interaction_plots/'
output_sperm = '../output/raw_sperm_plots/'
# gene_lists from sleuth
# tpm vals for PCA
path = '../input/sleuth_results/'
# pos beta means high old adults
dfBetaA = pd.read_csv(path + "si2_aging_analysis.csv", comment='#')
dfBetaA.dropna(inplace=True)
# pos beta means high in fog2
dfBetaG = pd.read_csv(path + "si3_genotype_analysis.csv", comment='#')
dfBetaG.dropna(inplace=True)
# pos beta means high in fog2-aged
dfBetaAG = pd.read_csv(path + "si4_interaction_analysis.csv", comment='#')
dfBetaAG.dropna(inplace=True)
# likelihood ratio test results
dfLRT = pd.read_csv(path + "lrt.csv")
dfLRT.dropna(inplace=True)
# sort by target_id
dfBetaA.sort_values('target_id', inplace=True)
dfBetaA.reset_index(inplace=True)
dfBetaG.sort_values('target_id', inplace=True)
dfBetaG.reset_index(inplace=True)
dfBetaAG.sort_values('target_id', inplace=True)
dfBetaAG.reset_index(inplace=True)
dfLRT.sort_values('target_id', inplace=True)
dfLRT.reset_index(inplace=True)
# gold standard datasets
path = '../input/gold_standards/'
dfDaf12 = pd.read_csv(path + 'daf12genes.csv')
dfDaf16 = pd.read_csv(path + 'daf16genes.csv')
dfLund = pd.read_csv(path + 'lund_data.csv', header=None, names=['gene'])
dfEckley = pd.read_csv(path + 'eckley_data.csv', header=None, names=['gene'])
dfMurphyUp = pd.read_csv(path + 'murphy_data_lifespan_extension.csv')
dfMurphyDown = pd.read_csv(path + 'murphy_data_lifespan_decrease.csv')
dfHalaschek = pd.read_csv(path + 'Halaschek-Wiener_data.csv')
# gpcrs
dfGPCR = pd.read_csv(path + 'all_gpcrs.csv')
dfICh = pd.read_csv(path + 'select_ion_transport_genes.csv')
dfAxon = pd.read_csv(path + 'axonogenesis_genes.csv')
dfNP = pd.read_csv(path + 'neuropeptides.csv')
# gpcr is going to go into a gold standard fxn so add an 'origin' colmn
dfGPCR['origin'] = 'gpcrs'
dfICh['origin'] = 'select ion transport genes'
dfAxon['origin'] = 'axonogenesis genes'
dfNP['origin'] = 'neuropeptide genes'
frames = [dfGPCR, dfICh, dfAxon, dfNP]
dfTargets = pd.concat(frames)
dfTargets.columns = ['gene', 'effect']
# place all the gold standards in a single dataframe:
dfDaf12['origin'] = 'daf-12'
dfDaf16['origin'] = 'daf-16'
dfEckley['origin'] = 'Eckley'
dfLund['origin'] = 'Lund'
dfMurphyUp['origin'] = 'MurphyExt'
dfMurphyDown['origin'] = 'MurphyDec'
dfHalaschek['origin'] = 'Halaschek'
frames = [dfDaf12, dfDaf16, dfEckley, dfLund,
dfMurphyDown, dfMurphyUp, dfHalaschek]
dfGoldStandard = pd.concat(frames)
# from wormbase
dfLifespanGenes = pd.read_csv(path + 'lifespan gene list complete.csv')
tissue_df = tea.fetch_dictionary()
phenotype_df = pd.read_csv('../input/phenotype_ontology.csv')
go_df = pd.read_csv('../input/go_dictionary.csv')
# transcription factors:
tf = pd.read_csv('../input/tf_list.csv')
# some non-essential parameters
xticksize = 15.5
xlabelsize = 23
ylabelsize_volcano = 23
volcano_legendsize = 10
# plot size
ms = 4
In [4]:
teaA = tea.enrichment_analysis(dfBetaA[(dfBetaA.qval < qval)].ens_gene, tissue_df,
show=False)
teaG = tea.enrichment_analysis(dfBetaG[(dfBetaG.qval < qval)].ens_gene, tissue_df,
show=False)
teaAG = tea.enrichment_analysis(dfBetaAG[(dfBetaAG.qval < qval)].ens_gene, tissue_df,
show=False)
In [5]:
peaA = tea.enrichment_analysis(dfBetaA[(dfBetaA.qval < qval)].ens_gene, phenotype_df,
show=False)
peaG = tea.enrichment_analysis(dfBetaG[(dfBetaG.qval < qval)].ens_gene, phenotype_df,
show=False)
peaAG = tea.enrichment_analysis(dfBetaAG[(dfBetaAG.qval < qval)].ens_gene, phenotype_df,
show=False)
In [6]:
# GEA
geaA = tea.enrichment_analysis(dfBetaA[(dfBetaA.qval < qval)].ens_gene, go_df,
show=False)
geaG = tea.enrichment_analysis(dfBetaG[(dfBetaG.qval < qval)].ens_gene, go_df,
show=False)
geaAG = tea.enrichment_analysis(dfBetaAG[(dfBetaAG.qval < qval)].ens_gene, go_df,
show=False)
In [7]:
def fix_fonts(n, ax):
"""A function to set the fontsize for all labels."""
if type(n) != int:
raise ValueError('n must be an integer')
ax.xaxis.label.set_fontsize(n)
ax.yaxis.label.set_fontsize(n)
ax.title.set_fontsize(n)
Plot the TEA results for genes that change as animals age.
In [8]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(teaA, save=False)
plt.title('Enriched Tissues in Aging-Related Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_aging + 'tea_aging.svg', transparent=True)
Plot the TEA results for genes that change between WT and fog-2 mutants
In [9]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(teaG, save=False)
plt.title('Enriched Tissues in Genotype-Related Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_genotype + 'tea_genotype.svg', transparent=True)
Plot the TEA results for genes that have an interaction $\beta$ significantly different from 0
In [10]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(teaAG, save=False)
plt.title('Enriched Tissues in Interacting Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_interaction + 'tea_interaction.svg', transparent=True)
In [11]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(peaA, save=False, analysis='phenotype')
plt.title('Enriched Phenotypes in Aging-Related Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_aging + 'pea_aging.svg', transparent=True)
PEA for genotype gene set
In [12]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(peaG, save=False, analysis='phenotype')
plt.title('Enriched Phenotypes in Genotype-Related Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_genotype + 'pea_genotype.svg', transparent=True)
PEA for interaction gene set:
In [13]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(peaAG, save=False, analysis='phenotype')
plt.title('Enriched Phenotypes in Interacting Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_interaction + 'pea_interaction.svg', transparent=True)
In [14]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(geaA, save=False, analysis='go')
plt.title('Enriched Terms in Aging-Related Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_aging + 'gea_aging.svg', transparent=True)
GEA for genotype gene set
In [15]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(geaG, save=False, analysis='go')
plt.title('Enriched Terms in Genotype-Related Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_genotype + 'gea_genotype.svg', transparent=True)
GEA for interaction gene set
In [16]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(geaAG, save=False, analysis='go')
plt.title('Enriched Terms in Interacting Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_interaction + 'gea_interaction.svg', transparent=True)
In [17]:
sig = (dfBetaA.qval < 0.1)
gold = (dfBetaA.ens_gene.isin(dfGoldStandard.gene))
found = dfBetaA[sig & gold].shape[0]
pval = stats.hypergeom.sf(found, dfBetaA.shape[0], dfGoldStandard.shape[0], dfBetaA[sig].shape[0])
s = 'There are {0} gold standard genes in the aging set. '
s += 'This number is enriched above background at a p-value of {1:.2g}'
print(s.format(found, pval))
Before we start plotting the volcano plots, we need to define a selector function and a downsampler function.
The selector function will take in a dataframe of genes, beta values and q-values and return two dataframes: a dataframe containing transcription factors and a dataframe with all other genes.
The downsampler function will randomly remove datapoints that are below a certain q-value threshold. The reason to do this is that plotting 15,000 points is not very easy or useful, particularly if all the points are falling within a small area. Therefore, to make the plot easier to understand and smaller in memory, we downsample it.
Again, the downsampling is done only for plotting purposes!
In [18]:
def selector(df):
"""A function to separate tfs from everything else"""
sig = (df.qval < 0.1)# & (dfBetaA.b.abs() > 0.5)
not_tf = (~df.target_id.isin(tf.target_id))
is_tf = (df.target_id.isin(tf.target_id))
to_plot_not = df[sig & not_tf]
to_plot_yes = df[sig & is_tf]
return to_plot_not, to_plot_yes
def downsampler(df, threshold=10**-10, frac=0.30):
"""A function to downsample a dataframe"""
if (frac < 0) or (frac > 1):
raise ValueError('frac must be between 0 and 1')
df = df[df.qval > threshold]
select = np.int(np.floor(df.shape[0]*frac))
downsampled = df.sample(select)
return downsampled
For each plot, we plot transcription factors in orange, and all other genes as green.
Note: If you compile this and it gives an error, just run it again multiple times until it works. Matplotlib sometimes hates on LaTex, but I guarantee you it always works. I don't know what makes matplotlib throw an error-it must be a failure to load a module.
In [20]:
# downsampling threshold
threshold = 10**-30
# figure out where the TFs are
to_plot_not, to_plot_yes = selector(dfBetaA)
# figure out what to downsample
to_plot_not_downsampled = downsampler(to_plot_not, threshold)
# plot everything not a TF and above a certain -logq value
plt.plot(to_plot_not[to_plot_not.qval < threshold].b,
-to_plot_not[to_plot_not.qval < threshold].qval.apply(np.log10), 'o',
color='#1a9641', alpha=.6, ms=ms,
label='D.E. Isoforms, {0}'.format(to_plot_not.shape[0]),)
# plot tfs
plt.plot(to_plot_yes.b, -to_plot_yes.qval.apply(np.log10), 'o',
color='#fc8d59', ms=ms, label='D.E. T.F. Isoforms, {0}'.format(to_plot_yes.shape[0]))
# plot the legend so that you don't get multiple labels
plt.legend(loc=(0.6,.7), frameon=True).set_path_effects([path_effects.Normal()])
# plot downsampled points that are below the downsampling sig. threshold
# and set zorder to 0 so they are behind everything else
plt.plot(to_plot_not_downsampled.b, -to_plot_not_downsampled.qval.apply(np.log10), 'o',
color='#1a9641', alpha=0.6, ms=ms, zorder=0)
plt.xticks([-8, -4, 0, 4, 8])
plt.yticks([0, 50, 100])
plt.xlabel(r'\beta_\mathrm{Aging}').set_path_effects([path_effects.Normal()])
plt.ylabel(r'-\log_{10}{Q}').set_path_effects([path_effects.Normal()])
plt.title('Differentially Expressed Isoforms in Aging Worms',
y=1.02).set_path_effects([path_effects.Normal()])
plt.savefig(output_aging + 'volcano_plot_aging.svg', transparent=False, bbox_inches='tight')
In [20]:
to_plot_not, to_plot_yes = selector(dfBetaG)
threshold=10**-20
to_plot_not_downsampled = downsampler(to_plot_not, threshold, frac=0.60)
# plot everything not a TF and above a certain -logq value
plt.plot(to_plot_not[to_plot_not.qval < threshold].b,
-to_plot_not[to_plot_not.qval < threshold].qval.apply(np.log10), 'o',
color='#1a9641', alpha=0.6, ms=ms,
label='D.E. Isoforms, {0}'.format(to_plot_not.shape[0]))
# plot tfs
plt.plot(to_plot_yes.b, -to_plot_yes.qval.apply(np.log10), 'o',
color='#fc8d59', ms=ms, label='D.E. T.F. Isoforms, {0}'.format(to_plot_yes.shape[0]))
# plot the legend so that you don't get multiple labels
plt.legend(loc=(0.6,.85), frameon=True, fontsize=12).set_path_effects([path_effects.Normal()])
# plot downsampled points that are below the downsampling sig. threshold
# and set zorder to 0 so they are behind everything else
plt.plot(to_plot_not_downsampled.b, -to_plot_not_downsampled.qval.apply(np.log10), 'o',
color='#1a9641', alpha=0.6, ms=ms, zorder=0)
plt.xticks([-8, -4, 0, 4, 8])
plt.yticks([0, 15, 30])
plt.xlabel(r'\beta_\mathrm{Genotype}').set_path_effects([path_effects.Normal()])
plt.ylabel(r'-\log_{10}{Q}').set_path_effects([path_effects.Normal()])
plt.title(r'Differentially Expressed Isoforms in \emph{fog-2} Worms',
y=1.02).set_path_effects([path_effects.Normal()])
plt.savefig(output_genotype + 'volcano_plot_genotype.svg', transparent=False)
In [21]:
title = r'Isoforms with Significant Interaction Coefficients'
# make temporary dataframes to plot
to_plot_not, to_plot_yes = selector(dfBetaAG)
# plot everything not a TF
plt.plot(to_plot_not.b, -to_plot_not.qval.apply(np.log10), 'o',
color='#1a9641', alpha=0.6, ms=ms,
label='D.E. Isoforms, {0}'.format(to_plot_not.shape[0]))
# plot TF
plt.plot(to_plot_yes.b, -to_plot_yes.qval.apply(np.log10), 'o',
color='#fc8d59', ms=ms,
label='D.E. T.F. Isoforms, {0}'.format(to_plot_yes.shape[0]))
#axes
plt.xticks([-12, -6, 0, 6, 12])
plt.yticks([0, 6, 12])
plt.xlabel(r'\beta_\mathrm{Genotype}').set_path_effects([path_effects.Normal()])
plt.ylabel(r'-\log_{10}{Q}').set_path_effects([path_effects.Normal()])
plt.title(title).set_path_effects([path_effects.Normal()])
plt.legend(loc=(0.6,.7), frameon=True, fontsize=12)
Out[21]:
In [22]:
# Aging Transcription Factors
ind1 = (dfBetaA.qval < qval)
ind2 = (dfBetaA.target_id.isin(tf.target_id))
inds = ind1 & ind2
x = dfBetaA[inds].sort_values('qval')
print(x.shape)
x.to_csv('../output/tf_aging.csv')
In [23]:
# Genotype Transcription Factors
ind1 = (dfBetaG.qval < qval)
ind2 = (dfBetaG.target_id.isin(tf.target_id))
inds = ind1 & ind2
y = dfBetaG[inds].sort_values('qval')
print(y.shape)
y.to_csv('../output/tf_genotype.csv')
In [24]:
# INteraction Transcription Factors
ind1 = (dfBetaAG.qval < qval)
ind2 = (dfBetaAG.target_id.isin(tf.target_id))
inds = ind1 & ind2
z = dfBetaAG[inds].sort_values('qval')
print(z.shape)
In [25]:
teaTFA = tea.enrichment_analysis(x.ens_gene, tissue_df, show=False)
teaTFG = tea.enrichment_analysis(y.ens_gene, tissue_df, show=False)
teaTFAG = tea.enrichment_analysis(z.ens_gene, tissue_df, show=False)
In [26]:
l = len('WBbt:0005829')
teaTFA.Tissue = teaTFA.Tissue.map(lambda x: str(x)[:-l-1])
teaTFG.Tissue = teaTFG.Tissue.map(lambda x: str(x)[:-l-1])
teaTFAG.Tissue = teaTFAG.Tissue.map(lambda x: str(x)[:-l-1])
teaTFA[(teaTFA.Expected > 2) &
(teaTFA.Observed > 2)][['Tissue', 'Expected', 'Observed', 'Q value']]
Out[26]:
In [27]:
teaTFG[(teaTFG.Expected > 2) & (teaTFG.Observed > 2)][['Tissue', 'Expected', 'Observed', 'Q value']]
Out[27]:
In order to define the female transcriptome, we must:
The last bullet point should contain the most trustworthy points for the female transcriptome. It's also nice in the sense that the 'epistasis' is not enforced (as it is in the 3rd bullet point).
The 3rd bullet point checks whether the effects of aging/genotype are similar to each other. At the least, it establishes that these two variables act on the same phenotype (gene expression for specific genes), but it doesn't speak about genetic interaction (i.e., we still don't know how many of these interactions are additive or epistatic).
The 4th bullet point checks whether there is epistasis. I consider these points the most likely to conform to a female transcriptome.
We must reindex the dataframes, then we can find the genes that have $\beta$ different from 0 in both aging and genotype.
In [28]:
# reindex the dataframes
dfBetaA.sort_values('target_id', inplace=True)
dfBetaA.reset_index(inplace=True)
dfBetaG.sort_values('target_id', inplace=True)
dfBetaG.reset_index(inplace=True)
dfBetaAG.sort_values('target_id', inplace=True)
dfBetaAG.reset_index(inplace=True)
ind1 = (dfBetaA.qval < 0.1)
ind2 = (dfBetaG.qval < 0.1)
intersect = dfBetaA[ind1 & ind2]
Next, we can figure out how many genes are altered by aging, genotype, and how many of these genes are altered in BOTH conditions.
In [29]:
# find the number of genes that are D.E. in both conditions:
intersect.ens_gene.unique().shape
Out[29]:
In [30]:
# Number of genes that are D.E. in fog-2
dfBetaG[ind2].ens_gene.unique().shape
Out[30]:
In [31]:
# number of genes that are D.E. in aging
dfBetaA[ind1].ens_gene.unique().shape
Out[31]:
Finally, we deliver the coup de grace and set up all the conditions to find the female transcriptome.
In [32]:
# find the female state genes by identifying genes
# that change in the same direction for the B_aging and
# B_genotype coeffs, and have a non-zero value
# for the B_interaction coeff.
ind3 = dfBetaA.b*dfBetaG.b > 0
ind4 = dfBetaAG.qval < 0.1
coexpressed = dfBetaA[(ind1) & (ind2) & (ind3)]
female_state = dfBetaA[(ind1) & (ind2) & (ind3) & (ind4)]
aging_set = dfBetaA[(ind1) & (~dfBetaA.ens_gene.isin(female_state.ens_gene))]
genotype_nonfemale_set = dfBetaG[(ind2) & (~dfBetaG.ens_gene.isin(female_state.ens_gene))]
Print out the result.
In [33]:
print(
"""
{0} genes are coexpressed between aging and genotype trajectories.
{1} genes are coexpressed and have statistically significant interaction coefficients
""".format(coexpressed.ens_gene.unique().shape[0],
female_state.ens_gene.unique().shape[0]))
In [34]:
teaF = tea.enrichment_analysis(female_state.ens_gene.unique(), tissue_df=tissue_df, show=False)
peaF = tea.enrichment_analysis(female_state.ens_gene.unique(), tissue_df=phenotype_df, show=False)
geaF = tea.enrichment_analysis(female_state.ens_gene.unique(), tissue_df=go_df, show=False)
In [35]:
fig, ax = plt.subplots()
tea.plot_enrichment_results(peaF, save=False, analysis='phenotype')
plt.title('Enriched Phenotypes in Female State Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_interaction + 'pea_female.svg', transparent=True)
fig, ax = plt.subplots()
tea.plot_enrichment_results(geaF, save=False, analysis='go')
plt.title('Enriched Terms in Female State Genes')
plt.ylabel('')
fix_fonts(15, ax)
plt.savefig(output_interaction + 'gea_female.svg', transparent=True)
As a safety check, make sure that female-transcriptome genes are well annotated (i.e., the results above are not the result of biased curation)
In [36]:
melted_tissues = pd.melt(tissue_df, id_vars='wbid', var_name='term', value_name='expressed')
melted_tissues = melted_tissues[melted_tissues.expressed == 1]
melted_pheno = pd.melt(phenotype_df, id_vars='wbid', var_name='term', value_name='expressed')
melted_pheno = melted_pheno[melted_pheno.expressed == 1]
melted_go = pd.melt(go_df, id_vars='wbid', var_name='term', value_name='expressed')
melted_go = melted_go[melted_go.expressed == 1]
In [37]:
# Make sure these terms are well-annotated:
ind = melted_tissues.wbid.isin(female_state.ens_gene.unique())
print(
"""Female State Genes annotated with tissue information: {0}
""".format(melted_tissues[ind].wbid.unique().shape[0])
)
ind = melted_pheno.wbid.isin(female_state.ens_gene.unique())
print(
"""Female State Genes annotated with phenotype information: {0}
""".format(melted_pheno[ind].wbid.unique().shape[0])
)
ind = melted_go.wbid.isin(female_state.ens_gene.unique())
print(
"""Female State Genes annotated with GO information: {0}
""".format(melted_go[ind].wbid.unique().shape[0])
)
Save the results to a file
In [38]:
female_state.to_csv('../output/female_state.csv', index=False)
In [39]:
def make_expression_axes(tooltips, title,
xlabel, ylabel):
"""A function to plot the bokeh single mutant comparisons."""
# Make the hover tool
hover = bokeh.models.HoverTool(tooltips=tooltips,
names=['circles'])
# Create figure
p = bokeh.plotting.figure(title=title, plot_width=650,
plot_height=450)
p.xgrid.grid_line_color = 'white'
p.ygrid.grid_line_color = 'white'
p.xaxis.axis_label = xlabel
p.yaxis.axis_label = ylabel
# Add the hover tool
p.add_tools(hover)
return p
def add_points(p, df1, x, y, se_x, color='blue', alpha=0.2, outline=False):
# Define colors in a dictionary to access them with
# the key from the pandas groupby funciton.
df = df1.copy()
transformed_q = -df[y].apply(np.log10)
df['transformed_q'] = transformed_q
source1 = bokeh.models.ColumnDataSource(df)
# Specify data source
p.circle(x=x, y='transformed_q', size=7,
alpha=alpha, source=source1,
color=color, name='circles')
if outline:
p.circle(x=x, y='transformed_q', size=7,
alpha=1,
source=source1, color='black',
fill_color=None, name='outlines')
# prettify
p.background_fill_color = "#DFDFE5"
p.background_fill_alpha = 0.5
return p
def selector(df):
"""A function to separate tfs from everything else"""
sig = (df.qval < 0.1)# & (dfBetaA.b.abs() > 0.5)
not_tf = (~df.target_id.isin(tf.target_id))
is_tf = (df.target_id.isin(tf.target_id))
to_plot_not = df[sig & not_tf]
to_plot_yes = df[sig & is_tf]
return to_plot_not, to_plot_yes
In [40]:
import io as io
# What pops up on hover?
tooltips = [('ext_gene', '@ext_gene')]
p = make_expression_axes( tooltips, 'Aging Volcano Plot',
'Beta Coefficient (log-fold change)', '-log(Q)')
to_plot_not, to_plot_yes = selector(dfBetaA)
p = add_points(p, to_plot_not, 'b', 'qval', 'se_b', color='#1a9641')
p = add_points(p, to_plot_yes, 'b', 'qval', 'se_b', color='#fc8d59', alpha=0.6, outline=True)
html = file_html(p, CDN, "my plot")
with open('../output/aging_volcano.html', 'w') as f:
f.write(html)
In [41]:
# What pops up on hover?
tooltips = [('ext_gene', '@ext_gene')]
p = make_expression_axes( tooltips, 'Genotype Volcano Plot',
'Beta Coefficient (log-fold change)', '-log(Q)')
to_plot_not, to_plot_yes = selector(dfBetaG)
p = add_points(p, to_plot_not, 'b', 'qval', 'se_b', color='#1a9641')
p = add_points(p, to_plot_yes, 'b', 'qval', 'se_b', color='#fc8d59', alpha=0.6, outline=True)
with open('../output/genotype_volcano.html', 'w') as f:
f.write(html)
In [42]:
# What pops up on hover?
tooltips = [('ext_gene', '@ext_gene')]
p = make_expression_axes( tooltips, 'Aging::Genotype Volcano Plot',
'Beta Coefficient (log-fold change)', '-log(Q)')
to_plot_not, to_plot_yes = selector(dfBetaAG)
p = add_points(p, to_plot_not, 'b', 'qval', 'se_b', color='#1a9641')
p = add_points(p, to_plot_yes, 'b', 'qval', 'se_b', color='#fc8d59', alpha=0.6, outline=True)
html = file_html(p, CDN, "my plot")
# HTML(html)
with open('../output/interaction_volcano.html', 'w') as f:
f.write(html)
In [43]:
def compare_points(p, df_1, df_2, b='b', q='qval', se_b='se_b', color='blue',
alpha=0.2, outline=False, threshold=50):
"""A function to plot the b values between two dataframes against each other"""
# Define colors in a dictionary to access them with
# the key from the pandas groupby funciton.
df1 = df_1.copy()
df1['b_2'] = df_2.b
transformed_q = -df1[q].apply(np.log10)
transformed_q[transformed_q > threshold] = threshold
cols = [
"#%02x%02x%02x" % (int(r),
int(g), int(b)) for r, g, b, _ in
255*mpl.cm.viridis(mpl.colors.Normalize()(
transformed_q))
]
df1['colors'] = cols
source1 = bokeh.models.ColumnDataSource(df1)
# Specify data source
p.circle(x='b', y='b_2',
color='colors',
source=source1, size=7,
alpha=alpha, name='circles')
# prettify
p.background_fill_color = "#DFDFE5"
p.background_fill_alpha = 0.5
return p
In [44]:
# sort by target_id
dfBetaA.sort_values('target_id', inplace=True)
dfBetaA.reset_index(inplace=True, drop=True)
dfBetaG.sort_values('target_id', inplace=True)
dfBetaG.reset_index(inplace=True, drop=True)
dfBetaAG.sort_values('target_id', inplace=True)
dfBetaAG.reset_index(inplace=True, drop=True)
dfLRT.sort_values('target_id', inplace=True)
dfLRT.reset_index(inplace=True, drop=True)
In [45]:
indA = dfBetaA.target_id.isin(intersect.target_id)
indG = dfBetaG.target_id.isin(intersect.target_id)
# What pops up on hover?
tooltips = [('ext_gene', '@ext_gene')]
p = make_expression_axes(tooltips, 'Aging vs Genotype Volcano Plot',
'Beta Coefficient (Aging)', 'Beta Coefficient (Genotype)')
sig = (dfBetaA.qval < 0.1) & (dfBetaAG.qval < 0.1) & (dfBetaG.qval < 0.1)
p = compare_points(p, dfBetaA[sig], dfBetaG[sig], 'b',
'qval', 'se_b', color='gray', alpha=1)
html = file_html(p, CDN, "my plot")
with open('../output/aging_vs_genotype.html', 'w') as f:
f.write(html)
In [46]:
inag = dfBetaAG[dfBetaAG.qval < 0.1].target_id
ina = dfBetaA[dfBetaA.qval < 0.1].target_id
ing = dfBetaG[dfBetaG.qval < 0.1].target_id
ind1 = (dfBetaA.target_id.isin(inag)) & (dfBetaA.qval < 0.1) & (dfBetaA.target_id.isin(ing))
ind2 = (dfBetaAG.qval < 0.1) & (dfBetaAG.target_id.isin(ina)) & (dfBetaAG.target_id.isin(ing))
to_plot_not1, to_plot_yes1 = selector(dfBetaA[ind1])
to_plot_not2, to_plot_yes2 = selector(dfBetaAG[ind2])
p = make_expression_axes( tooltips, 'Aging vs Aging::Genotype Plot',
'Beta Coefficient (Aging)', 'Beta Coefficient (Aging::Genotype)')
# p = rectangle(p, 10, 0.1, -10, -0.01, -10, -0.01, 0.01, 10)
p = compare_points(p, dfBetaA[sig], dfBetaAG[sig], 'b',
'qval', 'se_b', color='gray', alpha=0.8)
html = file_html(p, CDN, "my plot")
with open('../output/aging_vs_interaction.html', 'w') as f:
f.write(html)
In [47]:
ind1 = (dfBetaG.target_id.isin(inag)) & (dfBetaG.qval < 0.1) & (dfBetaG.target_id.isin(ina))
ind2 = (dfBetaAG.qval < 0.1) & (dfBetaAG.target_id.isin(ina)) & (dfBetaAG.target_id.isin(ing))
to_plot_not1, to_plot_yes1 = selector(dfBetaG[ind1])
to_plot_not2, to_plot_yes2 = selector(dfBetaAG[ind2])
p = make_expression_axes( tooltips, 'Genotype vs Aging::Genotype Plot',
'Beta Coefficient (Genotype)', 'Beta Coefficient (Aging::Genotype)')
p = compare_points(p, to_plot_not1, to_plot_not2,
'b', 'qval', 'se_b', color='gray', alpha=0.5)
p = compare_points(p, to_plot_yes1, to_plot_yes2,
'b', 'qval', 'se_b', color='gray', alpha=0.7, outline=True)
html = file_html(p, CDN, "my plot")
with open('../output/genotype_vs_interaction.html', 'w') as f:
f.write(html)
In [48]:
threshold = 20
transformed_q = -dfBetaA[sig].qval.apply(np.log10)
transformed_q[transformed_q > threshold] = threshold
cols = [
"#%02x%02x%02x" % (int(r),
int(g), int(b)) for r, g, b, _ in
255*mpl.cm.viridis(mpl.colors.Normalize()(
transformed_q))]
x = np.linspace(-6, 6)
In [49]:
fig, ax = plt.subplots()
plot = plt.scatter(dfBetaA[sig].b, dfBetaG[sig].b,
c=transformed_q, cmap='magma', alpha=.8, s=25)
plt.xticks([-6, 0, 6], fontsize=20)
plt.yticks([-6, 0, 6], fontsize=20)
plt.xlabel(r'$\beta_\mathrm{Aging}$', fontsize=20).set_path_effects([path_effects.Normal()])
plt.ylabel(r'$\beta_\mathrm{Genotype}$',
fontsize=20).set_path_effects([path_effects.Normal()])
title = r'\emph{fog-2} phenocopies early aging in \emph{C. elegans}'
plt.title(title).set_path_effects([path_effects.Normal()])
for i, label in enumerate(ax.get_xticklabels()):
ax.get_xticklabels()[i].set_path_effects([path_effects.Normal()])
for i, label in enumerate(ax.get_yticklabels()):
ax.get_yticklabels()[i].set_path_effects([path_effects.Normal()])
bar = plt.colorbar(plot)
bar.ax.set_title('$-log_{10}(q)$')
bar.set_ticks([2, 10, 20])
plt.savefig('../output/fog2phenocopiesaging.svg', bbox_inches='tight')
In [50]:
def f(B, x):
return B*x
linear = odr.Model(f)
sx = np.sqrt(dfBetaA[sig].se_b**2 + dfBetaG[sig].se_b**2)
mydata = odr.RealData(dfBetaA[sig].b + dfBetaG[sig].b, dfBetaAG[sig].b, sx=sx, sy=dfBetaAG[sig].se_b)
myodr = odr.ODR(mydata, linear, beta0=[0])
myoutput = myodr.run()
In [51]:
myoutput.pprint()
In [52]:
def plot_epistasis_regression(X, slope, **kwargs):
"""Plot the ODR line."""
# find the xmin and xmax:
xmin = X.min()
xmax = X.max()
x = np.linspace(xmin - 0.1, xmax + 0.1, 1000)
y0 = x*slope
# plot the models
plt.plot(x, y0, **kwargs)
def epiplot(X, Y, Y_se, **kwargs):
"""Given two arrays, X and Y, plot the points."""
plot_unbranched = kwargs.pop('plot_unbranched', False)
beta = kwargs.pop('beta', np.nan)
s0 = kwargs.pop('s0', 15)
# Calculate the point density
points = np.vstack([X, Y])
z = gaussian_kde(points)(points)
# plot:
fig, ax = plt.subplots()
if len(X) > 50:
plot = plt.scatter(X, Y, c=z, s=s0/Y_se,
edgecolor='', cmap='viridis', alpha=0.5)
else:
plot = plt.scatter(X, Y, s=s0/np.sqrt(Y_se),
color='#33a02c', alpha=.9)
if plot_unbranched:
smoothX = np.linspace(X.min() - 0.5, X.max() + 0.5, 1000)
plt.plot(smoothX, -1/2*smoothX, color='#1f78b4', ls='--',
label='Unbranched Pathway')
if beta:
plot_epistasis_regression(X, beta, ls='-', lw=2.3,
color='#33a02c', label='fit')
plt.xlabel(r'Predicted Additive Effect')
plt.ylabel(r'Deviation from Additive Effect')
bar = plt.colorbar(plot)
bar.ax.set_title('Density')
bar.set_ticks([z.min(), z.max()])
bar.set_ticklabels(['Low', 'High'])
plt.legend()
return ax
In [53]:
ax = epiplot(dfBetaA[sig].b + dfBetaG[sig].b, dfBetaAG[sig].b, dfBetaAG[sig].se_b, beta=myoutput.beta)
plt.savefig('../output/epistasis_plot_age_genotype.svg')
In [54]:
_ = tea.enrichment_analysis(dfBetaAG[sig].ens_gene.unique(), phenotype_df)
tea.plot_enrichment_results(_)
Out[54]:
In [ ]: